A new fast food chain is seeing rapid expansion over the past couple of years. They are now trying to optimize their supply chain to ensure that there are no shortages of ingredients. For this, they’ve tasked their data science team to come up with a model that could predict the output of each food processing farm over the next few years. These predictions could further increase the efficiency of their current supply chain management systems.
○ date: The timestamp at which the yield of the food processing farm was measured ○ farm_id: The farm identifier that recognizes the farm food processing plant ○ ingredient_type: The type of ingredient being produced ○ yield: The yield of the plant in tons
○ farm_id: The farm identifier that recognizes the farm food processing plant ○ founding_year: The year when operations commenced on the farm and food processing plant. ○ num_processing_plants: The number of processing plants present on the farm ○ farm_area: The area of the farm in square meters ○ farming_company: The company that owns the farms ○ deidentified_location: The location at which the farm is present
○ For each location where the farms are present, the weather data is also provided by timestamp
The evaluation metric used for this course would be the Root Mean Squared Error (RMSE)
from google.colab import drive
drive.mount('/content/drive')
!pip install category_encoders
import pandas as pd
from category_encoders import OrdinalEncoder
import re
import matplotlib.pyplot as plt
import numpy as np
from operator import itemgetter
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import StandardScaler
from itertools import groupby
from tqdm import tqdm_notebook
import seaborn as sns
from sklearn.model_selection import train_test_split
import seaborn as sns
from mlxtend.plotting import plot_learning_curves
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
def get_heatmap(data):
    """Render an annotated correlation heatmap over the numeric columns of ``data``."""
    figure, axis = plt.subplots(figsize=(15, 15))
    corr_matrix = data.corr()
    sns.heatmap(
        corr_matrix,
        linewidths=0.1,
        cmap='viridis',
        annot=True,
        ax=axis,
        annot_kws={"size": 12},
    )
    plt.show()
def get_dist_plot(data, col):
    """Plot the distribution of column ``col`` of ``data``."""
    series = data[col]
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases;
    # histplot/displot is the successor — confirm the installed version.
    sns.distplot(series)
    plt.show()
def get_NaN_info(data):
    """Summarise missing values per column of ``data``.

    Returns a DataFrame indexed by column name with columns:
      - 'Type': the column dtype
      - 'Total Cnt of NaN': count of missing values in the column
      - 'Total % of NaN': percentage of missing values in the column
    sorted by the NaN percentage, descending.

    The original built this through a chain of three self-merges on the
    index; constructing the frame directly is equivalent and readable.
    """
    nan_counts = data.isnull().sum()
    summary = pd.DataFrame({
        'Type': data.dtypes,
        'Total Cnt of NaN': nan_counts,
        'Total % of NaN': nan_counts / len(data) * 100,
    })
    return summary.sort_values('Total % of NaN', ascending=False)
def inspect_data(data):
    """Per-column summary: dtype, distinct-level count, level percentage and raw levels.

    Sorted by '% of Unique Levels' descending.
    """
    level_counts = data.apply(lambda col: col.nunique(), axis=0)
    level_pcts = data.apply(lambda col: col.nunique() / len(col) * 100, axis=0)
    raw_levels = data.apply(lambda col: str(col.unique()), axis=0)
    summary = pd.DataFrame({
        "Data Type": data.dtypes,
        "No of Levels": level_counts,
        "% of Unique Levels": level_pcts,
        "Levels": raw_levels,
    })
    return summary.sort_values("% of Unique Levels", ascending=False)
def get_levels_NaNs_combined_info(data, cnt):
    """Combine level statistics with NaN statistics for the ``cnt`` most-missing columns."""
    top_nan = get_NaN_info(data).head(cnt)
    level_info = inspect_data(data[top_nan.index]).sort_index()
    # columns 1 and 2 of the NaN summary are the count and percentage of NaNs
    return level_info.merge(top_nan.iloc[:, [1, 2]], left_index=True, right_index=True)
def my_reset_index(df):
    """Move the (possibly multi-level) index of ``df`` into leading columns."""
    index_as_cols = df.index.to_frame(index=False)
    body = df.reset_index(drop=True)
    return index_as_cols.merge(body, left_index=True, right_index=True)
def memory_usage(df):
    """Deep memory footprint of ``df`` in megabytes, rounded to two decimals."""
    total_bytes = df.memory_usage(deep=True).sum()
    return round(total_bytes / 1024 ** 2, 2)
def get_dict_list(list_inp, idx):
    """Group tuples by their first element.

    Returns ``{first_element: [tuple[1:idx], ...]}``.

    The previous implementation used ``itertools.groupby``, which only
    groups *consecutive* equal keys: on input that is not pre-sorted by
    the key, later runs silently overwrote earlier ones in the dict.
    Plain accumulation is identical for sorted input (as produced by the
    groupby-index call sites in this notebook) and correct for unsorted
    input as well.
    """
    grouped = {}
    for item in list_inp:
        grouped.setdefault(item[0], []).append(item[1:idx])
    return grouped
# --- Load the three raw CSVs from Google Drive ---
farm_data=pd.read_csv('/content/drive/My Drive/farm_data.csv')
train_weather=pd.read_csv('/content/drive/My Drive/train_weather.csv')
train_data=pd.read_csv('/content/drive/My Drive/train_data.csv')
# Baseline memory footprint, before any dtype optimisation (compared later).
print('Memory used by farm_data:', memory_usage(farm_data), 'MB')
print('Memory used by train_weather:', memory_usage(train_weather), 'MB')
print('Memory used by train_data:', memory_usage(train_data), 'MB')
print('-------------------------------------------------------')
# Min/max of every float64 column in each frame — used to judge which
# columns fit in float16/float32 when downcasting below.
print(pd.concat([pd.DataFrame(farm_data.select_dtypes(include=[np.float64]).min(),columns=['Farm_Min']),\
pd.DataFrame(farm_data.select_dtypes(include=[np.float64]).max(),columns=['Farm_Max'])],axis=1))
print('-------------------------------------------------------')
print(pd.concat([pd.DataFrame(train_weather.select_dtypes(include=[np.float64]).min(),columns=['Trn_Weather_Min']),\
pd.DataFrame(train_weather.select_dtypes(include=[np.float64]).max(),columns=['Trn_Weather_Max'])],axis=1))
print('-------------------------------------------------------')
print(pd.concat([pd.DataFrame(train_data.select_dtypes(include=[np.float64]).min(),columns=['Trn_data_Min']),\
pd.DataFrame(train_data.select_dtypes(include=[np.float64]).max(),columns=['Trn_data_Max'])],axis=1))
print('-------------------------------------------------------')
# --- Copy each frame and downcast dtypes to shrink memory usage ---
train_data3=train_data.copy()
train_data3.date = pd.to_datetime(train_data3.date, infer_datetime_format=True)
train_data3.farm_id=train_data3.farm_id.astype('category')
train_data3.ingredient_type=train_data3.ingredient_type.astype('category')
train_data3['yield']=train_data3['yield'].astype(np.float32)
train_weather3=train_weather.copy()
train_weather3.timestamp = pd.to_datetime(train_weather3.timestamp, infer_datetime_format=True)
train_weather3.deidentified_location = train_weather3.deidentified_location.astype('category')
farm_data3=farm_data.copy()
farm_data3.deidentified_location = farm_data3.deidentified_location.astype('category')
farm_data3.farm_id = farm_data3.farm_id.astype('category')
farm_data3.farming_company = farm_data3.farming_company.astype('category')
# NOTE(review): float16 keeps only ~3 significant decimal digits; columns with
# large magnitudes (e.g. pressure_sea_level ~1000) lose precision — confirm
# the min/max printout above showed this loss is acceptable.
train_weather3['temp_obs']=train_weather3['temp_obs'].astype(np.float16)
train_weather3['cloudiness']=train_weather3['cloudiness'].astype(np.float16)
train_weather3['wind_direction']=train_weather3['wind_direction'].astype(np.float16)
train_weather3['dew_temp']=train_weather3['dew_temp'].astype(np.float16)
train_weather3['pressure_sea_level']=train_weather3['pressure_sea_level'].astype(np.float16)
train_weather3['precipitation']=train_weather3['precipitation'].astype(np.float16)
train_weather3['wind_speed']=train_weather3['wind_speed'].astype(np.float16)
farm_data3['operations_commencing_year']=farm_data3['operations_commencing_year'].astype(np.float16)
farm_data3['num_processing_plants']=farm_data3['num_processing_plants'].astype(np.float16)
farm_data3['farm_area']=farm_data3['farm_area'].astype(np.float32)
# Memory footprint after optimisation — compare with the baseline printed above.
print('Memory used by farm_data:', memory_usage(farm_data3), 'MB')
print('Memory used by train_weather:', memory_usage(train_weather3), 'MB')
print('Memory used by train_data:', memory_usage(train_data3), 'MB')
print('-------------------------------------------------------')
# Re-check min/max after downcasting to verify no overflow/clipping occurred.
print(pd.concat([pd.DataFrame(farm_data3.select_dtypes(include=[np.float16,np.float32]).min(),columns=['Farm_Min']),\
pd.DataFrame(farm_data3.select_dtypes(include=[np.float16,np.float32]).max(),columns=['Farm_Max'])],axis=1))
print('-------------------------------------------------------')
print(pd.concat([pd.DataFrame(train_weather3.select_dtypes(include=[np.float16,np.float32]).min(),columns=['Trn_Weather_Min']),\
pd.DataFrame(train_weather3.select_dtypes(include=[np.float16,np.float32]).max(),columns=['Trn_Weather_Max'])],axis=1))
print('-------------------------------------------------------')
print(pd.concat([pd.DataFrame(train_data3.select_dtypes(include=[np.float16,np.float32]).min(),columns=['Trn_data_Min']),\
pd.DataFrame(train_data3.select_dtypes(include=[np.float16,np.float32]).max(),columns=['Trn_data_Max'])],axis=1))
print('-------------------------------------------------------')
# The raw float64 frames are no longer needed.
del [farm_data,train_data,train_weather]
# Missing-value / level summaries and a quick look at each optimised frame.
get_levels_NaNs_combined_info(train_data3,len(train_data3.columns))
train_data3.describe()
get_levels_NaNs_combined_info(train_weather3,len(train_weather3.columns))
train_weather3.tail(3)
# Weather rows are expected to be unique per (timestamp, location).
any(train_weather3.loc[:,['timestamp','deidentified_location']].duplicated())
get_levels_NaNs_combined_info(farm_data3,len(farm_data3.columns))
farm_data3.tail(3)
# --- Bar charts: farm counts and total farm area per company / location ---
fig,ax=plt.subplots(1,2,figsize=(10,4))
fig.tight_layout(pad=3)
# Number of farms owned by each company.
df=farm_data3.groupby(['farming_company'])['farm_id'].size().sort_values(ascending=False)
df.plot.bar(ax=ax[0])
ax[0].set_title('Number of Farms in a Farming Company')
ax[0].set_xlabel(str.upper('farming_company'),fontsize=10)
# Number of farms per deidentified location.
df=farm_data3.groupby(['deidentified_location'])['farm_id'].size().sort_values(ascending=False)
df.plot.bar(ax=ax[1],color='darkorange')
ax[1].set_title('Number of Farms in a Deidentified Location')
ax[1].set_xlabel(str.upper('deidentified_location'),fontsize=10)
plt.show()
fig,ax=plt.subplots(1,2,figsize=(10,4))
fig.tight_layout(pad=3)
# Total farmed area per company.
df=farm_data3.groupby(['farming_company'])['farm_area'].sum().sort_values(ascending=False)
df.plot.bar(ax=ax[0])
ax[0].set_title('Total Farm Area in a Farming Company')
ax[0].set_xlabel(str.upper('farming_company'),fontsize=10)
# Total farmed area per location.
df=farm_data3.groupby(['deidentified_location'])['farm_area'].sum().sort_values(ascending=False)
df.plot.bar(ax=ax[1],color='darkorange')
ax[1].set_title('Total farm area in a Deidentified Location')
ax[1].set_xlabel(str.upper('deidentified_location'),fontsize=10)
plt.show()
# --- Monthly mean of every weather variable, one line per location ---
train_weather2=train_weather3.copy()
train_weather2['month']=pd.DatetimeIndex(train_weather2['timestamp']).month
location_list=list(np.unique(train_weather2['deidentified_location']))
trn_weather_cols=train_weather3.columns
# NOTE(review): pyplot was already imported at the top of the file; this
# re-import is redundant but harmless.
import matplotlib.pyplot as plt
# Skip the first two columns (timestamp, deidentified_location) — plot the rest.
for col_indx,col in enumerate(trn_weather_cols[2:]):
fig,ax=plt.subplots(figsize=(10,7))
plt.gca().set_prop_cycle('color',plt.cm.Spectral(np.linspace(0,1,16)))
for loc_indx,location in enumerate(location_list):
# Monthly mean of this weather column for one location.
temp_df=pd.DataFrame(train_weather2[train_weather2['deidentified_location']==location].groupby(['month'])[col]\
.mean()).rename(columns={col:location+' :: '+col})
temp_df['month']=temp_df.index
plt.plot('month',temp_df.columns[0],data=temp_df,linewidth=1.7)
plt.xlabel('MONTH')
plt.ylabel(str.upper(col))
plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
plt.show()
#break
# --- Investigate farm_ids that appear more than once in farm_data ---
any(farm_data3['farm_id'].duplicated())
# farm_ids with a second (or later) occurrence in farm_data.
dup_fids=list(farm_data3[farm_data3['farm_id'].duplicated()]['farm_id'])
len(dup_fids)
farm_data3[farm_data3['farm_id'].isin(dup_fids)].sort_values('farm_id').shape
farm_data3[farm_data3['farm_id'].isin(dup_fids)].sort_values('farm_id').head()
train_data3.tail(3)
# Does train_data contain duplicated (farm_id, date, ingredient_type) rows?
any(train_data3.loc[:,['farm_id','date','ingredient_type']].duplicated())
# Count of yield records per (farm_id, date) and ingredient; a value > 1
# means the same measurement key occurs multiple times.
train_data3_grouped2=train_data3.groupby(['farm_id','date','ingredient_type'])['yield'].size().unstack()
train_data3_grouped2=train_data3_grouped2[(train_data3_grouped2['ing_w']>1)|(train_data3_grouped2['ing_x']>1)|(train_data3_grouped2['ing_y']>1)|\
(train_data3_grouped2['ing_z']>1)]
train_data3_grouped2.columns=[str(x) for x in list(train_data3_grouped2.columns)]
train_data3_grouped2=my_reset_index(train_data3_grouped2)
train_data3_grouped2.head()
# farm_ids that have multiple records for the same measurement key.
mul_rec_fids=list(np.unique(train_data3_grouped2['farm_id']))
len(mul_rec_fids)
print(np.sort(dup_fids))
print(np.sort(mul_rec_fids))
# All train rows for the multi-record farms, sorted so duplicates are adjacent.
train_data3_dups=train_data3[train_data3['farm_id'].isin(mul_rec_fids)]
train_data3_dups=train_data3_dups.sort_values(['farm_id','date','ingredient_type','yield'])
# Join those rows with the duplicated farm_data rows to see which
# (location, company) combinations a duplicated farm maps to.
train3_dups_farm3_dups_df=train_data3_dups.merge(farm_data3[farm_data3['farm_id'].isin(dup_fids)].sort_values('farm_id'),left_on='farm_id',right_on='farm_id')
dup_farm_loc_list=list(train3_dups_farm3_dups_df.groupby(['farm_id','deidentified_location','farming_company','ingredient_type'])['ingredient_type'].size().index)
# --- Map each duplicated farm_id to its (location, company, ingredients) ---
res = get_dict_list(dup_farm_loc_list,4)
res2={}
res3={}
# Nest the grouping one level deeper: farm_id -> location -> [(company, ingredient), ...]
for key,values in res.items():
res2[key]=get_dict_list(values,3)
print('---------------------------------------------------------------------------------------------')
# Pretty-print: for each farm_id, the ingredients ---> location ---> companies.
for k,v in res2.items():
print(k)
for k2,v2 in v.items():
l=[]
for ing in v2:
l+=list(ing)
# even positions hold companies, odd positions hold ingredient types
l1=np.unique(l[0::2])
l2=np.unique(l[1::2])
#print(k,)
print(' ',l2,'--->',k2,'--->',l1)
print('---------------------------------------------------------------------------------------------')
# Show, per multi-record farm, the first-occurrence rows vs the duplicated rows.
for fid in mul_rec_fids:
print('=========================================================================================')
print(train_data3_dups[(train_data3_dups['farm_id']==fid)&~(train_data3_dups.loc[:,['farm_id','date','ingredient_type']].duplicated())].tail(10))
print('-----------------------------------------------------------------------------------------')
print(train_data3_dups[(train_data3_dups['farm_id']==fid)&(train_data3_dups.loc[:,['farm_id','date','ingredient_type']].duplicated())].tail(10))
print('=========================================================================================')
farm_data3[farm_data3['farm_id'].isin(dup_fids)].sort_values(['farm_id','farm_area']).tail(10)
farm_data33=farm_data3[farm_data3['farm_id'].isin(dup_fids)].sort_values('farm_id')
# Hand-labelled ingredient assignment for each duplicated farm_data row,
# derived from the printout above ('dups_' prefix marks the duplicate side).
# NOTE(review): this list is positional — it assumes the exact row order of
# the duplicated farms above; re-derive it if the data changes.
farm_data33['ings']=['ing_w ing_x','dups_ing_w ing_y','dups_ing_w','ing_w','dups_ing_w ing_y ing_x','ing_w','dups_ing_w','ing_w',\
'dups_ing_w','ing_w','ing_w ing_x','dups_ing_w','ing_w','dups_ing_w','ing_w','dups_ing_w','ing_w ing_x ing_y',\
'dups_ing_w dups_ing_x ing_z','ing_w ing_x ing_z','dups_ing_w','dups_ing_w','ing_w','dups_ing_w','ing_w',\
'ing_w ing_x ing_z','dups_ing_w','dups_ing_w','ing_w','ing_w ing_x','dups_ing_w']
farm_data333=farm_data33.sort_values(['farm_id','ings'],ascending=False)
farm_data333.head(10)
indexes2={}
for fid in mul_rec_fids:
# taking row indexes of farms data from train dataset, all indexes excluding the duplicate record indexes
indexes2['nondup-'+fid]=list(train_data3_dups[(train_data3_dups['farm_id']==fid)&~(train_data3_dups.loc[:,['farm_id','date','ingredient_type']].duplicated())].index)
# taking row indexes of farms data from train dataset, only the duplicate record indexes
indexes2['dup-'+fid]=list(train_data3_dups[(train_data3_dups['farm_id']==fid)&(train_data3_dups.loc[:,['farm_id','date','ingredient_type']].duplicated())].index)
# from the above farm_data333 dataframe taking the non-dups_ings information for a particular farm id that are along with the dups_ings
# If there are no non-dups_ings for a farm id, we get empty list. Example: for fid_97094 we get empty, for fid_26064 we get
# ['ing_y', 'ing_x'] as we have dups_ings i.e dups_ing_w along with them
dups_loc_non_dups=[i for i in list(farm_data333[farm_data333['farm_id']==fid].tail(1)['ings'].values)[0].split() if 'dups_' not in i]
# taking row indexes of farms data from train dataset, all indexes of the non-dups_ings extracted above specific to
# a particular farm
if len(dups_loc_non_dups):
indexes2['dupslocnondup-'+fid]=list(train_data3_dups[(train_data3_dups['farm_id']==fid)&~(train_data3_dups.loc[:,['farm_id','date','ingredient_type']].duplicated())&(train_data3_dups['ingredient_type'].isin(dups_loc_non_dups))].index)
indexes2.keys()
# --- Attach farming_company / deidentified_location to each train row ---
# For duplicated farms the assignment alternates through a small state
# machine: element==1 handles 'nondup-' keys (head row of farm_data333),
# element==2 handles 'dup-' keys (tail row), element==3/else handles the
# optional 'dupslocnondup-' keys.  NOTE(review): this relies on indexes2
# preserving the nondup/dup/dupslocnondup insertion order per farm.
train_data33=train_data3.copy()
#element is a variable to control the below flow of code
element=1
for key,idx_value in indexes2.items():
#print(key)
fid=key.split('-')[1]
if (key.split('-')[0]!='dupslocnondup')&(element==3):
# if there are no dupslocnondup- for a file id making element = 1 to control the below execution flow
element=1
if(element==1):
# this code is for nondup- indexes
# Extracting farming_company and deidentified_location for a particular farm_id
farm_comp=farm_data333[farm_data333['farm_id']==fid].head(1)['farming_company'].values[0]
deid_loc=farm_data333[farm_data333['farm_id']==fid].head(1)['deidentified_location'].values[0]
train_data33.loc[(train_data33.index.isin(idx_value)),'farming_company']=farm_comp
train_data33.loc[(train_data33.index.isin(idx_value)),'deidentified_location']=deid_loc
prev_idx_value=idx_value
element=2
elif element==2:
# this code is for dup- indexes
farm_comp=farm_data333[farm_data333['farm_id']==fid].tail(1)['farming_company'].values[0]
deid_loc=farm_data333[farm_data333['farm_id']==fid].tail(1)['deidentified_location'].values[0]
train_data33.loc[(train_data33.index.isin(idx_value)),'farming_company']=farm_comp
train_data33.loc[(train_data33.index.isin(idx_value)),'deidentified_location']=deid_loc
element=3
else:
# this code is for dupslocnondup- indexes
farm_comp=farm_data333[farm_data333['farm_id']==fid].tail(1)['farming_company'].values[0]
deid_loc=farm_data333[farm_data333['farm_id']==fid].tail(1)['deidentified_location'].values[0]
train_data33.loc[(train_data33.index.isin(idx_value)),'farming_company']=farm_comp
train_data33.loc[(train_data33.index.isin(idx_value)),'deidentified_location']=deid_loc
element=1
train_data33.farming_company=train_data33.farming_company.astype('category')
train_data33.deidentified_location=train_data33.deidentified_location.astype('category')
train_data33.shape
train_data33[train_data33['farm_id'].isin(dup_fids)].tail()
train_data33[~(train_data33['farm_id'].isin(dup_fids))].tail()
# Rows of farms WITHOUT duplicate records get their attributes via a plain merge.
train_data33_nondups=train_data33[~(train_data33['farm_id'].isin(mul_rec_fids))]
train_data33_nondups.shape
train_data33_nondups_farm3_df=train_data33_nondups.merge(farm_data3,left_on=['farm_id'],right_on=['farm_id'])
# Keep the merged (suffixed '_y') attribute columns and restore plain names.
train_data33_nondups_farm3_df=train_data33_nondups_farm3_df.loc[:,['date', 'farm_id', 'ingredient_type', 'yield', 'farming_company_y',\
'deidentified_location_y']]
train_data33_nondups_farm3_df.columns=['date', 'farm_id', 'ingredient_type', 'yield', 'farming_company','deidentified_location']
# Recombine the merged non-duplicate rows with the hand-assigned duplicate rows.
train_data_final=pd.concat([train_data33_nondups_farm3_df,train_data33[train_data33['farm_id'].isin(mul_rec_fids)]],axis=0)
train_data_final.shape
train_data_final=train_data_final.sort_values(['farm_id','date','ingredient_type','yield'])
# Free intermediate frames that are no longer needed.  globals().pop is
# used instead of `del` so that a name that was never created does not
# raise a NameError and abort the run — the original `del[train_data3_scaled]`
# crashed here because train_data3_scaled is never defined in this notebook.
for _name in ('train3_dups_farm3_dups_df', 'train_data33_nondups',
              'train_data3_grouped2', 'train_data33_nondups_farm3_df',
              'train_data3_scaled', 'train_data3_dups',
              'train_data3', 'train_data33'):
    globals().pop(_name, None)
# Restore the categorical dtypes lost during the .loc assignments above.
train_data_final.farming_company=train_data_final.farming_company.astype('category')
train_data_final.deidentified_location=train_data_final.deidentified_location.astype('category')
train_data_final.head(3)
# Join farm attributes with their location's weather (one row per farm x timestamp).
farm3_weather3_df=farm_data3.merge(train_weather3,left_on=['deidentified_location'],right_on=['deidentified_location'])
farm3_weather3_df.shape
farm3_weather3_df.head(3)
pd.DataFrame(train_weather3.groupby(['deidentified_location']).size()).head(3)
pd.DataFrame(train_data_final.groupby(['deidentified_location','ingredient_type','farming_company','farm_id']).size()).head()
# Left-join yields with weather: a train row's date is matched to the
# weather timestamp at the same farm/company/location.
final_merged_df=train_data_final.merge(farm3_weather3_df,how='left',left_on=['farm_id','date','farming_company','deidentified_location'],\
right_on=['farm_id','timestamp','farming_company','deidentified_location'])
final_merged_df.shape
# Calendar features derived from the measurement date.
final_merged_df['day']=pd.DatetimeIndex(final_merged_df['date']).day
final_merged_df['day']=final_merged_df['day'].astype(np.uint8)
final_merged_df['month']=pd.DatetimeIndex(final_merged_df['date']).month
final_merged_df['month']=final_merged_df['month'].astype(np.uint8)
final_merged_df['hour']=final_merged_df['date'].dt.hour
final_merged_df['hour']=final_merged_df['hour'].astype(np.uint8)
# Standardise yield on a copy to find records more than 3 SDs from the mean.
final_merged_df_sc=final_merged_df.copy()
scaler=StandardScaler()
final_merged_df_sc[['yield']]=scaler.fit_transform(final_merged_df_sc[['yield']])
indexes=list(final_merged_df_sc[(final_merged_df_sc['yield']>=3)|(final_merged_df_sc['yield']<=-3)].index)
print(len(indexes))
# Only the high-side outliers remain after this reassignment.
indexes=list(final_merged_df_sc[final_merged_df_sc['yield']>=3].index)
print(len(indexes))
# Which farms do the outlier records belong to?
np.unique(final_merged_df_sc[final_merged_df_sc.index.isin(indexes)]['farm_id'])
final_merged_df[final_merged_df.index.isin(indexes)].groupby(['farm_id','ingredient_type']).size()
# Compare each suspect farm's average yield with and without the outliers.
print('Avg Yield of fid_121183=',final_merged_df[final_merged_df['farm_id'].isin(['fid_121183'])].loc[:,'yield'].mean(),'tons')
print('Avg Yield of fid_72059=',final_merged_df[final_merged_df['farm_id'].isin(['fid_72059'])].loc[:,'yield'].mean(),)
print('Avg Yield of excluding possible outlier records of fid_121183=',final_merged_df[(final_merged_df['farm_id'].isin(['fid_121183']))\
&~(final_merged_df.index.isin(indexes))].loc[:,'yield'].mean(),'tons')
print('Avg Yield of excluding possible outlier records of fid_72059=',final_merged_df[(final_merged_df['farm_id'].isin(['fid_72059']))\
&~(final_merged_df.index.isin(indexes))].loc[:,'yield'].mean(),'tons')
# Average yield of the outlier records of these 2 farms
print('Avg Yield of only possible outlier records of fid_121183=',final_merged_df[(final_merged_df['farm_id'].isin(['fid_121183']))&(final_merged_df.index.isin(indexes))].loc[:,'yield'].mean(),'tons')
print('Avg Yield of only possible outlier records of fid_72059=',final_merged_df[(final_merged_df['farm_id'].isin(['fid_72059']))&(final_merged_df.index.isin(indexes))].loc[:,'yield'].mean(),'tons')
# Ratio of outlier mean to normal mean for each farm — the source of the
# hard-coded divisors 97 and 2672 used below (rounded ratios).
print(581066.4375 /5962.3154296875 )
print(10588519.0 /3961.556396484375 )
print(pd.DataFrame(final_merged_df[(final_merged_df['farm_id'].isin(['fid_121183']))&(final_merged_df.index.isin(indexes))].loc[:,'yield']/97).mean())
print(pd.DataFrame(final_merged_df[(final_merged_df['farm_id'].isin(['fid_72059']))&(final_merged_df.index.isin(indexes))].loc[:,'yield']/2672).mean())
# Row indexes of the possible outlier records, per farm.
# NOTE(review): this reassigns `indexes2`, clobbering the dedup index dict
# built earlier — intentional only because that dict is no longer needed.
indexes1=list(final_merged_df_sc[(final_merged_df_sc['yield']>=3)&(final_merged_df['farm_id'].isin(['fid_121183']))].index)
print(len(indexes1))
# Indexes of possible outlier records of fid_72059
indexes2=list(final_merged_df_sc[(final_merged_df_sc['yield']>=3)&(final_merged_df['farm_id'].isin(['fid_72059']))].index)
print(len(indexes2))
# Keep an untouched copy before rescaling the outliers.
final_merged_df2=final_merged_df.copy()
final_merged_df.loc[final_merged_df.index.isin(indexes1),'yield'].shape
final_merged_df.loc[final_merged_df.index.isin(indexes2),'yield'].shape
print(final_merged_df[(final_merged_df['farm_id'].isin(['fid_121183']))&(final_merged_df.index.isin(indexes))].loc[:,'yield'].mean())
print(final_merged_df[(final_merged_df['farm_id'].isin(['fid_72059']))&(final_merged_df.index.isin(indexes))].loc[:,'yield'].mean())
# Rescale the outlier yields down to the farm's normal magnitude
# (presumably a unit/recording error — the divisors are the ratios above).
final_merged_df.loc[final_merged_df.index.isin(indexes1),'yield']=\
final_merged_df.loc[final_merged_df.index.isin(indexes1),'yield']/97
final_merged_df.loc[final_merged_df.index.isin(indexes2),'yield']=\
final_merged_df.loc[final_merged_df.index.isin(indexes2),'yield']/2672
print(final_merged_df[(final_merged_df['farm_id'].isin(['fid_121183']))].loc[:,'yield'].mean())
print(final_merged_df[(final_merged_df['farm_id'].isin(['fid_72059']))].loc[:,'yield'].mean())
# Fill missing weather/farm values by linear interpolation (forward only,
# so leading NaNs remain; precipitation gets an extra backward pass).
final_merged_df['wind_speed']=final_merged_df['wind_speed'].interpolate(method ='linear', limit_direction ='forward')
final_merged_df['wind_direction']=final_merged_df['wind_direction'].interpolate(method ='linear', limit_direction ='forward')
final_merged_df['temp_obs']=final_merged_df['temp_obs'].interpolate(method ='linear', limit_direction ='forward')
final_merged_df['pressure_sea_level']=final_merged_df['pressure_sea_level'].interpolate(method ='linear', limit_direction ='forward')
final_merged_df['precipitation']=final_merged_df['precipitation'].interpolate(method ='linear', limit_direction ='forward')
final_merged_df['farm_area']=final_merged_df['farm_area'].interpolate(method ='linear', limit_direction ='forward')
final_merged_df['dew_temp']=final_merged_df['dew_temp'].interpolate(method ='linear', limit_direction ='forward')
final_merged_df['cloudiness']=final_merged_df['cloudiness'].interpolate(method ='linear', limit_direction ='forward')
final_merged_df['precipitation']=final_merged_df['precipitation'].interpolate(method ='linear', limit_direction ='backward')
# Drop columns not used as model features.
final_merged_df.drop(['num_processing_plants'],axis=1,inplace=True)
final_merged_df.drop(['operations_commencing_year'],axis=1,inplace=True)
final_merged_df.drop(['timestamp'],axis=1,inplace=True)
# --- Monthly total yield per deidentified location, one column per ingredient ---
# NOTE(review): the y-axis label text 'Yeild' is a typo in the rendered figure.
fig, axes = plt.subplots(nrows=8, ncols=4, figsize=(17,22))
for ing_idx,ing in enumerate(['ing_w','ing_x','ing_y','ing_z']):
# Pivot: rows = month, one column per location, values = total yield.
temp_dff=pd.DataFrame(final_merged_df[final_merged_df['ingredient_type']==ing]).\
groupby(['deidentified_location','month'])['yield'].sum().unstack().T
temp_dff.columns=[str(x) for x in list(temp_dff.columns)]
temp_dff=my_reset_index(temp_dff)
fig.tight_layout()
count=0
axes[count][ing_idx].set_title(ing,fontsize=15)
# Two locations per subplot; move to the next row after each pair.
for idx,col in enumerate(temp_dff.columns[1:]):
axes[count][ing_idx].plot('month',col,data=temp_dff)
show='n'
if (idx+1)%2==0:
axes[count][ing_idx].legend()
show='y'
axes[count][ing_idx].set_xlabel('Month')
axes[count][ing_idx].set_ylabel('Total Yeild (tons)')
count+=1
# Finish the last (odd) subplot if it was not closed inside the loop.
if show=='n':
axes[count][ing_idx].legend()
axes[count][ing_idx].set_xlabel('Month')
axes[count][ing_idx].set_ylabel('Total Yeild (tons)')
# --- Monthly total yield per farming company, one column per ingredient ---
# Same layout as the per-location figure above.
fig, axes = plt.subplots(nrows=8, ncols=4, figsize=(17,22))
for ing_idx,ing in enumerate(['ing_w','ing_x','ing_y','ing_z']):
# Pivot: rows = month, one column per company, values = total yield.
temp_dff=pd.DataFrame(final_merged_df[final_merged_df['ingredient_type']==ing]).\
groupby(['farming_company','month'])['yield'].sum().unstack().T
temp_dff.columns=[str(x) for x in list(temp_dff.columns)]
temp_dff=my_reset_index(temp_dff)
fig.tight_layout()
count=0
axes[count][ing_idx].set_title(ing,fontsize=15)
# Two companies per subplot; move to the next row after each pair.
for idx,col in enumerate(temp_dff.columns[1:]):
axes[count][ing_idx].plot('month',col,data=temp_dff)
show='n'
if (idx+1)%2==0:
axes[count][ing_idx].legend()
show='y'
axes[count][ing_idx].set_xlabel('Month')
axes[count][ing_idx].set_ylabel('Total Yeild (tons)')
count+=1
# Finish the last (odd) subplot if it was not closed inside the loop.
if show=='n':
axes[count][ing_idx].legend()
axes[count][ing_idx].set_xlabel('Month')
axes[count][ing_idx].set_ylabel('Total Yeild (tons)')
final_merged_df.columns
deidentified_locations=np.unique(final_merged_df['deidentified_location'])
# Redundant re-import of pyplot (already imported at the top); harmless.
import matplotlib.pyplot as plt
# --- For each location: monthly ing_w yield vs each weather variable, on
# one shared x-axis with seven extra twinned y-axes ---
for loc in deidentified_locations:
fig = plt.figure(figsize=(10,5))
host = fig.add_subplot(111)
# One twinned y-axis per weather variable.
par1 = host.twinx()
par2 = host.twinx()
par3 = host.twinx()
par4 = host.twinx()
par5 = host.twinx()
par6 = host.twinx()
par7 = host.twinx()
host.set_xlabel(str.upper("MONTH"),fontsize=12)
host.set_ylabel(str.upper("Yield"),fontsize=12)
par1.set_ylabel(str.upper("cloudiness"),fontsize=12)
par2.set_ylabel(str.upper("temp_obs"),fontsize=12)
par3.set_ylabel(str.upper("dew_temp"),fontsize=12)
par4.set_ylabel(str.upper("pressure_sea_level"),fontsize=12)
par5.set_ylabel(str.upper("precipitation"),fontsize=12)
par6.set_ylabel(str.upper("wind_speed"),fontsize=12)
par7.set_ylabel(str.upper("wind_direction"),fontsize=12)
months=np.unique(final_merged_df['month'])
# Monthly aggregates for this location, restricted to ingredient ing_w:
# total yield plus the mean of each weather variable.
yield1=pd.DataFrame(final_merged_df[(final_merged_df['deidentified_location']==loc)&(final_merged_df['ingredient_type']=='ing_w')].\
groupby(['month'])['yield'].sum())
cloudiness=pd.DataFrame(final_merged_df[(final_merged_df['deidentified_location']==loc)&(final_merged_df['ingredient_type']=='ing_w')].\
groupby(['month'])['cloudiness'].mean())
tempobs=pd.DataFrame(final_merged_df[(final_merged_df['deidentified_location']==loc)&(final_merged_df['ingredient_type']=='ing_w')].\
groupby(['month'])['temp_obs'].mean())
dew_temp=pd.DataFrame(final_merged_df[(final_merged_df['deidentified_location']==loc)&(final_merged_df['ingredient_type']=='ing_w')].\
groupby(['month'])['dew_temp'].mean())
pressure_sea_level=pd.DataFrame(final_merged_df[(final_merged_df['deidentified_location']==loc)&(final_merged_df['ingredient_type']=='ing_w')].\
groupby(['month'])['pressure_sea_level'].mean())
precipitation=pd.DataFrame(final_merged_df[(final_merged_df['deidentified_location']==loc)&(final_merged_df['ingredient_type']=='ing_w')].\
groupby(['month'])['precipitation'].mean())
wind_speed=pd.DataFrame(final_merged_df[(final_merged_df['deidentified_location']==loc)&(final_merged_df['ingredient_type']=='ing_w')].\
groupby(['month'])['wind_speed'].mean())
wind_direction=pd.DataFrame(final_merged_df[(final_merged_df['deidentified_location']==loc)&(final_merged_df['ingredient_type']=='ing_w')].\
groupby(['month'])['wind_direction'].mean())
p1, = host.plot(months,yield1,color='r',label="Yield",linewidth=4)
p2, = par1.plot(months,cloudiness,color='g',label="Cloudiness",linewidth=2)
p3, = par2.plot(months,tempobs,color='darkorange',label="Temp Obs",linewidth=2)
p4, = par3.plot(months,dew_temp,color='cyan',label="Dew Temp",linewidth=2)
p5, = par4.plot(months,pressure_sea_level,color='darkcyan',label="Pressure Sea Level",linewidth=2)
p6, = par5.plot(months,precipitation,color='b',label="Precipitation",linewidth=2)
p7, = par6.plot(months,wind_speed,color='m',label="Wind Speed",linewidth=2)
p8, = par7.plot(months,wind_direction,color='y',label="Wind Direction",linewidth=2)
lns = [p1, p2, p3, p4, p5, p6, p7, p8]
host.legend(handles=lns, loc='center left',bbox_to_anchor=(1.8, 0.5))
# Stagger the extra y-axes outward so their labels do not overlap.
par2.spines['right'].set_position(('outward', 60))
par3.spines['right'].set_position(('outward', 100))
par4.spines['right'].set_position(('outward', 140))
par5.spines['right'].set_position(('outward', 200))
par6.spines['right'].set_position(('outward', 250))
par7.spines['right'].set_position(('outward', 300))
# Colour each axis label to match its line.
host.yaxis.label.set_color(p1.get_color())
par1.yaxis.label.set_color(p2.get_color())
par2.yaxis.label.set_color(p3.get_color())
par3.yaxis.label.set_color(p4.get_color())
par4.yaxis.label.set_color(p5.get_color())
par5.yaxis.label.set_color(p6.get_color())
par6.yaxis.label.set_color(p7.get_color())
par7.yaxis.label.set_color(p8.get_color())
plt.title('Graph for Total yield in "'+loc+'" w.r.t to its weather conditions for "ing_w"',fontsize=15)
plt.show()
#break
# --- Ordinal-encode categoricals, ranked by each level's total yield ---
df=final_merged_df.groupby(['deidentified_location'])['yield'].sum().sort_values(ascending=False)
df.plot.bar()
# Locations ranked ascending by total yield -> ordinal codes 1..n.
deid_list_oe=final_merged_df.groupby(['deidentified_location'])['yield'].sum().sort_values().index
deid_list_oe_dict = { deid_list_oe[i] : i+1 for i in range(0, len(deid_list_oe) ) }
print(deid_list_oe_dict)
df=final_merged_df.groupby(['farming_company'])['yield'].sum().sort_values(ascending=False)
df.plot.bar()
# Companies ranked ascending by total yield -> ordinal codes 1..n.
fc_list_oe=final_merged_df.groupby(['farming_company'])['yield'].sum().sort_values().index
fc_list_oe_dict = { fc_list_oe[i] : i+1 for i in range(0, len(fc_list_oe) ) }
print(fc_list_oe_dict)
final_merged_df.columns
# Farms ranked ascending by total yield -> ordinal codes 1..n.
fid_list_oe=final_merged_df.groupby(['farm_id'])['yield'].sum().sort_values().index
fid_list_oe_dict = { fid_list_oe[i] : i+1 for i in range(0, len(fid_list_oe) ) }
len(fid_list_oe_dict.keys())
final_merged_df_cols=list(final_merged_df.columns)
# Apply the three mappings with category_encoders.OrdinalEncoder.
# NOTE(review): the pd.DataFrame(...) re-wrap after each transform looks
# redundant — OrdinalEncoder already returns a DataFrame; confirm.
mappings=[{'col':'farming_company','mapping':fc_list_oe_dict}
]
oe=OrdinalEncoder(mapping=mappings)
final_merged_df=oe.fit_transform(final_merged_df)
final_merged_df=pd.DataFrame(final_merged_df,columns=final_merged_df_cols)
mappings=[{'col':'deidentified_location','mapping':deid_list_oe_dict}
]
oe=OrdinalEncoder(mapping=mappings)
final_merged_df=oe.fit_transform(final_merged_df)
final_merged_df=pd.DataFrame(final_merged_df,columns=final_merged_df_cols)
mappings=[{'col':'farm_id','mapping':fid_list_oe_dict}
]
oe=OrdinalEncoder(mapping=mappings)
final_merged_df=oe.fit_transform(final_merged_df)
final_merged_df=pd.DataFrame(final_merged_df,columns=final_merged_df_cols)
final_merged_df.shape
final_merged_df.columns
final_merged_df.head(3)
# One-hot encode the ingredient type (drop_first avoids collinearity).
final_merged_df=pd.get_dummies(final_merged_df,drop_first=True,columns=['ingredient_type'])
final_merged_df.shape
final_merged_df.columns
# The raw datetime column is replaced by the day/month/hour features.
final_merged_df.drop(['date'],axis=1,inplace=True)
final_merged_df.head(2)
final_merged_df.shape
final_merged_df.columns
# ---- Train/validation split and a baseline Decision Tree regressor ----
final_merged_df_train=final_merged_df.copy()
del [final_merged_df2,final_merged_df_sc]
y_train_full = final_merged_df['yield']
# drop() already returns a new frame, so the extra .copy() was a wasted copy.
X_train_full = final_merged_df.drop("yield",axis=1)
X_train,X_val,y_train,y_val=train_test_split(X_train_full,y_train_full,test_size=.09,random_state=123)
del [X_train_full,y_train_full]
print(X_train.shape,X_val.shape,y_train.shape,y_val.shape)
# max_features='auto' was deprecated and then removed in scikit-learn; for a
# regressor it meant "consider all features", i.e. max_features=None —
# same behaviour, version-proof spelling.
lm = DecisionTreeRegressor(min_samples_split=25, max_features=None)
lm.fit(X_train,y_train)
y_train_pred_lm_be = lm.predict(X_train)
y_val_pred_lm_be = lm.predict(X_val)
print("=================================================================================")
# Competition metric is RMSE, so report sqrt of the MSE.
print("Train RMSE of Decision Tree:",np.sqrt(mean_squared_error(y_train, y_train_pred_lm_be)))
print("Validation RMSE of Decision Tree :",np.sqrt(mean_squared_error(y_val, y_val_pred_lm_be)))
plot_learning_curves(X_train, y_train, X_val, y_val, lm, scoring='mean_squared_error')
# Release training-side frames before loading the test data.  globals().pop
# is used instead of `del` because train_data3 and train_data33 were already
# removed right after train_data_final was built — the original
# `del [train_data3,train_data33]` raised a NameError here.
for _name in ('farm3_weather3_df', 'farm_comp', 'farm_data3',
              'farm_data33', 'farm_data333', 'train_data3', 'train_data33'):
    globals().pop(_name, None)
farm_data=pd.read_csv('/content/drive/My Drive/farm_data.csv')
test_weather=pd.read_csv('/content/drive/My Drive/test_weather.csv')
test_data=pd.read_csv('/content/drive/My Drive/test_data.csv')
test_data3=test_data.copy()
test_data3.date = pd.to_datetime(test_data3.date, infer_datetime_format=True)
test_data3.farm_id=test_data3.farm_id.astype('category')
test_data3.ingredient_type=test_data3.ingredient_type.astype('category')
test_weather3=test_weather.copy()
test_weather3.timestamp = pd.to_datetime(test_weather3.timestamp, infer_datetime_format=True)
test_weather3.deidentified_location = test_weather3.deidentified_location.astype('category')
farm_data3=farm_data.copy()
farm_data3.deidentified_location = farm_data3.deidentified_location.astype('category')
farm_data3.farm_id = farm_data3.farm_id.astype('category')
farm_data3.farming_company = farm_data3.farming_company.astype('category')
test_weather3['temp_obs']=test_weather3['temp_obs'].astype(np.float16)
test_weather3['cloudiness']=test_weather3['cloudiness'].astype(np.float16)
test_weather3['wind_direction']=test_weather3['wind_direction'].astype(np.float16)
test_weather3['dew_temp']=test_weather3['dew_temp'].astype(np.float16)
test_weather3['pressure_sea_level']=test_weather3['pressure_sea_level'].astype(np.float16)
test_weather3['precipitation']=test_weather3['precipitation'].astype(np.float16)
test_weather3['wind_speed']=test_weather3['wind_speed'].astype(np.float16)
farm_data3['operations_commencing_year']=farm_data3['operations_commencing_year'].astype(np.float16)
farm_data3['num_processing_plants']=farm_data3['num_processing_plants'].astype(np.float16)
farm_data3['farm_area']=farm_data3['farm_area'].astype(np.float32)
dup_fids=list(farm_data3[farm_data3['farm_id'].duplicated()]['farm_id'])
test_data3_grouped=test_data3.groupby(['farm_id','date','ingredient_type'])['id'].size().unstack()
test_data3_grouped2=test_data3_grouped[(test_data3_grouped['ing_w']>1)|(test_data3_grouped['ing_x']>1)|(test_data3_grouped['ing_y']>1)|\
(test_data3_grouped['ing_z']>1)]
test_data3_grouped2.columns=[str(x) for x in list(test_data3_grouped2.columns)]
test_data3_grouped2=my_reset_index(test_data3_grouped2)
test_data3_grouped2.head()
mul_rec_fids=list(np.unique(test_data3_grouped2['farm_id']))
#### Separate out the test rows that belong to farms with duplicated records
# and attach the (also duplicated) farm metadata for inspection.
test_data3_dups = test_data3[test_data3['farm_id'].isin(mul_rec_fids)]
test_data3_dups = test_data3_dups.sort_values(['farm_id', 'date', 'ingredient_type', 'id'])
dup_farms_meta = farm_data3[farm_data3['farm_id'].isin(dup_fids)].sort_values('farm_id')
test3_dups_farm3_dups_df = test_data3_dups.merge(dup_farms_meta, on='farm_id')
# We want to view which ingredients are present on each duplicated farm and
# where the farm is located / who operates it.
dup_farm_loc_list = list(
    test3_dups_farm3_dups_df
    .groupby(['farm_id', 'deidentified_location', 'farming_company', 'ingredient_type'])['ingredient_type']
    .size()
    .index
)
res = get_dict_list(dup_farm_loc_list, 4)
res2 = {}
res3 = {}  # kept (possibly used by a later cell), though unused in this view
for outer_key, outer_vals in res.items():
    res2[outer_key] = get_dict_list(outer_vals, 3)

# Pretty-print, per farm id: location ---> key ---> companies/ingredients.
separator = '---------------------------------------------------------------------------------------------'
print(separator)
for fid_key, loc_map in res2.items():
    print(fid_key)
    for loc_key, tuple_list in loc_map.items():
        flat = []
        for pair in tuple_list:
            flat.extend(list(pair))
        evens = np.unique(flat[0::2])  # first element of each tuple
        odds = np.unique(flat[1::2])   # second element of each tuple
        print(' ', odds, '--->', loc_key, '--->', evens)
    print(separator)
# Duplicated-farm metadata, hand-annotated with the ingredients each record
# serves; the 'dups_' prefix marks ingredients coming from the duplicate row.
farm_data33 = farm_data3[farm_data3['farm_id'].isin(dup_fids)].sort_values('farm_id')
farm_data33['ings'] = [
    'ing_w ing_x', 'dups_ing_w ing_y', 'dups_ing_w', 'ing_w',
    'dups_ing_w ing_y ing_x', 'ing_w', 'dups_ing_w', 'ing_w',
    'dups_ing_w', 'ing_w', 'ing_w ing_x', 'dups_ing_w',
    'ing_w', 'dups_ing_w', 'ing_w', 'dups_ing_w',
    'ing_w ing_x ing_y', 'dups_ing_w dups_ing_x ing_z',
    'ing_w ing_x ing_z', 'dups_ing_w', 'dups_ing_w', 'ing_w',
    'dups_ing_w', 'ing_w', 'ing_w ing_x ing_z', 'dups_ing_w',
    'dups_ing_w', 'ing_w', 'ing_w ing_x', 'dups_ing_w',
]
# Descending sort on 'ings' places the plain 'ing_*' record first and the
# 'dups_*' record last within each farm_id ('i' > 'd' lexicographically),
# so per-farm head(1) is the non-dup record and tail(1) the dup record.
farm_data333 = farm_data33.sort_values(['farm_id', 'ings'], ascending=False)
indexes2 = {}
for fid in mul_rec_fids:
    fid_rows = test_data3_dups['farm_id'] == fid
    repeat_rows = test_data3_dups.loc[:, ['farm_id', 'date', 'ingredient_type']].duplicated()
    # Test-row indexes for this farm: first occurrence of every
    # (farm, date, ingredient) triple ...
    indexes2['nondup-' + fid] = list(test_data3_dups[fid_rows & ~repeat_rows].index)
    # ... and the repeated occurrences of the same triple.
    indexes2['dup-' + fid] = list(test_data3_dups[fid_rows & repeat_rows].index)
    # From farm_data333 take the non-'dups_' ingredients that sit on the dup
    # record of this farm id.  Empty when the dup record only carries dups_*
    # ingredients (e.g. fid_97094 -> [], fid_26064 -> ['ing_y', 'ing_x']).
    dup_record_ings = list(farm_data333[farm_data333['farm_id'] == fid].tail(1)['ings'].values)[0]
    dups_loc_non_dups = [ing for ing in dup_record_ings.split() if 'dups_' not in ing]
    # Indexes of the first-occurrence rows restricted to those ingredients.
    if len(dups_loc_non_dups):
        ing_rows = test_data3_dups['ingredient_type'].isin(dups_loc_non_dups)
        indexes2['dupslocnondup-' + fid] = list(test_data3_dups[fid_rows & ~repeat_rows & ing_rows].index)
# For farms with duplicated metadata, stamp the correct farming_company and
# deidentified_location onto each group of test rows.  farm_data333 is sorted
# so that, per farm_id, head(1) is the plain metadata record and tail(1) the
# 'dups_*' record.
#
# The key prefixes built above map directly onto which record to use:
#   'nondup-…'        first-occurrence rows                  -> head(1)
#   'dup-…'           repeated rows                          -> tail(1)
#   'dupslocnondup-…' non-dup ingredients on the dup record  -> tail(1)
#
# (This replaces the previous 'element' state machine, which encoded exactly
# this mapping implicitly, duplicated the assignment code in every branch,
# and left an unused `prev_idx_value` behind.)
test_data33 = test_data3.copy()
for key, idx_value in indexes2.items():
    prefix, fid = key.split('-', 1)
    farm_rows = farm_data333[farm_data333['farm_id'] == fid]
    record = farm_rows.head(1) if prefix == 'nondup' else farm_rows.tail(1)
    farm_comp = record['farming_company'].values[0]
    deid_loc = record['deidentified_location'].values[0]
    row_mask = test_data33.index.isin(idx_value)
    test_data33.loc[row_mask, 'farming_company'] = farm_comp
    test_data33.loc[row_mask, 'deidentified_location'] = deid_loc
# Restore the categorical dtypes lost during the .loc assignments above.
test_data33.farming_company = test_data33.farming_company.astype('category')
test_data33.deidentified_location = test_data33.deidentified_location.astype('category')

# Farms without duplicated records get their metadata via a plain merge.
test_data33_nondups = test_data33[~test_data33['farm_id'].isin(mul_rec_fids)]
test_data33_nondups_farm3_df = test_data33_nondups.merge(farm_data3, on='farm_id')
keep_cols = ['date', 'farm_id', 'ingredient_type', 'id',
             'farming_company_y', 'deidentified_location_y']
test_data33_nondups_farm3_df = test_data33_nondups_farm3_df.loc[:, keep_cols]
test_data33_nondups_farm3_df.columns = ['date', 'farm_id', 'ingredient_type', 'id',
                                        'farming_company', 'deidentified_location']

# Stitch the two halves back together and re-sort into a deterministic order.
test_data_final = pd.concat(
    [test_data33_nondups_farm3_df,
     test_data33[test_data33['farm_id'].isin(mul_rec_fids)]],
    axis=0,
)
test_data_final = test_data_final.sort_values(['farm_id', 'date', 'ingredient_type', 'id'])
test_data_final.farming_company = test_data_final.farming_company.astype('category')
test_data_final.deidentified_location = test_data_final.deidentified_location.astype('category')
# Join farm metadata with the weather series for its location, then left-join
# onto every test record (matching farm, timestamp, company and location).
farm3_weather3_df = farm_data3.merge(test_weather3, on='deidentified_location')
final_merged_df = test_data_final.merge(
    farm3_weather3_df,
    how='left',
    left_on=['farm_id', 'date', 'farming_company', 'deidentified_location'],
    right_on=['farm_id', 'timestamp', 'farming_company', 'deidentified_location'],
)

# Fill gaps left by the join with linear interpolation.
# NOTE(review): interpolation runs over the whole frame in row order, so
# values can bleed across farm/location boundaries — confirm acceptable.
forward_cols = ['wind_speed', 'wind_direction', 'temp_obs', 'pressure_sea_level',
                'precipitation', 'farm_area', 'dew_temp', 'cloudiness']
for gap_col in forward_cols:
    final_merged_df[gap_col] = final_merged_df[gap_col].interpolate(
        method='linear', limit_direction='forward')
# Second, backward pass catches any leading NaNs in precipitation.
final_merged_df['precipitation'] = final_merged_df['precipitation'].interpolate(
    method='linear', limit_direction='backward')

# Drop columns not used as model features; timestamp duplicates date.
final_merged_df.drop(['num_processing_plants', 'operations_commencing_year', 'timestamp'],
                     axis=1, inplace=True)
final_merged_df.columns
# Calendar features derived from the record timestamp, stored as uint8.
date_index = pd.DatetimeIndex(final_merged_df['date'])
final_merged_df['day'] = date_index.day
final_merged_df['day'] = final_merged_df['day'].astype(np.uint8)
final_merged_df['month'] = date_index.month
final_merged_df['month'] = final_merged_df['month'].astype(np.uint8)
final_merged_df['hour'] = final_merged_df['date'].dt.hour
final_merged_df['hour'] = final_merged_df['hour'].astype(np.uint8)

# Keep an un-encoded copy for the visualisations further down.
final_merged_df_viz = final_merged_df.copy()
final_merged_df.shape
final_merged_df_cols = list(final_merged_df.columns)
# Ordinal-encode the categorical id columns using the SAME mapping dicts that
# were fitted on the training data, so train and test codes line up.
# (Replaces three copy-pasted encoder blocks with one loop; each iteration is
# identical to one of the original blocks, applied in the same order.)
for enc_col, enc_mapping in [
    ('farming_company', fc_list_oe_dict),
    ('deidentified_location', deid_list_oe_dict),
    ('farm_id', fid_list_oe_dict),
]:
    oe = OrdinalEncoder(mapping=[{'col': enc_col, 'mapping': enc_mapping}])
    final_merged_df = oe.fit_transform(final_merged_df)
    final_merged_df = pd.DataFrame(final_merged_df, columns=final_merged_df_cols)
final_merged_df.shape
# One-hot the ingredient type; drop_first leaves ing_w as the implicit
# baseline (only the ing_x / ing_y / ing_z dummy columns remain, as the
# all-zeros filters further down rely on).
final_merged_df = pd.get_dummies(final_merged_df, drop_first=True, columns=['ingredient_type'])
final_merged_df.shape
final_merged_df.columns
final_merged_df.head(2)
# Free the large intermediate frames before prediction to keep memory down.
del test_data33_nondups_farm3_df, test_data3_dups
del test3_dups_farm3_dups_df, test_data3_grouped, test_data3_grouped2
del farm_data, farm_data33, farm_data333
final_merged_df.shape
final_merged_df.columns
final_merged_df.head(2)

# Score the fitted model on the prepared test features (date/id are not
# features).
X_test = final_merged_df.copy()
X_test.drop(['date', 'id'], axis=1, inplace=True)
X_test.shape
X_test.head(5)
y_test_pred = lm.predict(X_test)
np.min(y_test_pred)
np.max(y_test_pred)
# Attach predictions for the supply analysis below.
X_test['yield'] = y_test_pred
X_test.columns
Below is the forecasted demand for ‘ing_w’ for the next year. Using the model you built, create a sourcing strategy for ‘ing_w’ such that the following conditions hold: ● The ingredient must be sourced at least a month before it is needed. ● The ingredient has a long shelf life and can be used for at least 8 months once stocked.
# December supply from the training year.  ing_w is the one-hot baseline, so
# rows with all three dummy columns at 0 are ing_w records.
is_train_ing_w = (
    (final_merged_df_train['ingredient_type_ing_x'] == 0)
    & (final_merged_df_train['ingredient_type_ing_y'] == 0)
    & (final_merged_df_train['ingredient_type_ing_z'] == 0)
)
supply_2016_december_ing_w = pd.DataFrame(
    final_merged_df_train[is_train_ing_w & (final_merged_df_train['month'] == 12)]
    .groupby(['month'])['yield'].sum())
supply_2016_december_ing_w

# Predicted monthly ing_w supply for the forecast year.
is_test_ing_w = (
    (X_test['ingredient_type_ing_x'] == 0)
    & (X_test['ingredient_type_ing_y'] == 0)
    & (X_test['ingredient_type_ing_z'] == 0)
)
supply_df = pd.DataFrame(X_test[is_test_ing_w].groupby(['month'])['yield'].sum())

month = list(range(1, 13))
month
# Given monthly demand forecast for ing_w.
demand = [90430481, 75344853, 91917783, 82591524, 87419669, 297500000,
          198900000, 97367031, 91049286, 91755300, 86533266, 88514610]
demand_df = pd.DataFrame(month, columns=['month'])
demand_df['demand'] = demand
demand_df.set_index(['month'], inplace=True)

# Source one month ahead: month m consumes the previous month's production,
# with December of the training year covering January.
supply_df['yield'] = supply_df['yield'].astype(np.int64)
supply_df['Yield Available'] = (list(np.int64(supply_2016_december_ing_w['yield']))
                                + list(supply_df['yield'])[0:11])
supply_df.columns = ['Yield Produced', 'Yield Available']
supply_df
# Surplus (or deficit) of available yield over demand, absolute and percent.
sup_dem_df = pd.DataFrame(supply_df['Yield Available'] - demand_df['demand'],
                          columns=['Yield Available-demand'])
sup_dem_df['Excess % of Yield Available'] = (
    (supply_df['Yield Available'] - demand_df['demand']) / demand_df['demand'] * 100)
supply_df.merge(demand_df, left_index=True, right_index=True) \
         .merge(sup_dem_df, left_index=True, right_index=True)

supply_vs_demand = supply_df.merge(demand_df, left_index=True, right_index=True)
supply_vs_demand.plot()
plt.title('Month wise Total Yield - Demand Graph for ing_w')
plt.show()
supply_vs_demand.plot.bar()
plt.title('Month wise Total Yield - Demand Graph for ing_w')
plt.show()
sup_dem_df['Excess % of Yield Available'].plot.bar()
plt.title('Percent Difference of Yield Available and Demand')
plt.show()
final_merged_df_viz.head(2)
final_merged_df_viz.shape
y_test_pred.shape
# Attach predictions to the un-encoded copy so plots can filter on the raw
# 'ingredient_type' / 'deidentified_location' strings.
final_merged_df_viz['yield']=y_test_pred
# 8x4 grid: one COLUMN per ingredient; subplot rows are filled two location
# curves at a time (legend flushed and row advanced after every second curve).
fig, axes = plt.subplots(nrows=8, ncols=4, figsize=(17,22))
for ing_idx,ing in enumerate(['ing_w','ing_x','ing_y','ing_z']):
    # Pivot to rows=month, columns=location, values=total predicted yield.
    temp_dff=pd.DataFrame(final_merged_df_viz[final_merged_df_viz['ingredient_type']==ing]).\
    groupby(['deidentified_location','month'])['yield'].sum().unstack().T
    temp_dff.columns=[str(x) for x in list(temp_dff.columns)]
    temp_dff=my_reset_index(temp_dff)
    fig.tight_layout()
    count=0  # subplot row currently being drawn into for this ingredient
    axes[count][ing_idx].set_title(ing,fontsize=15)
    for idx,col in enumerate(temp_dff.columns[1:]):  # skip the 'month' column
        axes[count][ing_idx].plot('month',col,data=temp_dff)
        show='n'  # 'n' while the current subplot still lacks legend/labels
        if (idx+1)%2==0:
            # Two curves drawn: finish this axes and move down a row.
            axes[count][ing_idx].legend()
            show='y'
            axes[count][ing_idx].set_xlabel('Month')
            axes[count][ing_idx].set_ylabel('Total Yeild (tons)')
            count+=1
    if show=='n':
        # Odd number of curves: the last subplot still needs legend/labels.
        axes[count][ing_idx].legend()
        axes[count][ing_idx].set_xlabel('Month')
        axes[count][ing_idx].set_ylabel('Total Yeild (tons)')
# Same layout as the previous figure, but curves are grouped by farming
# company instead of location: one subplot column per ingredient, two
# company curves per subplot row.
fig, axes = plt.subplots(nrows=8, ncols=4, figsize=(17,22))
for ing_idx,ing in enumerate(['ing_w','ing_x','ing_y','ing_z']):
    # Pivot to rows=month, columns=company, values=total predicted yield.
    temp_dff=pd.DataFrame(final_merged_df_viz[final_merged_df_viz['ingredient_type']==ing]).\
    groupby(['farming_company','month'])['yield'].sum().unstack().T
    temp_dff.columns=[str(x) for x in list(temp_dff.columns)]
    temp_dff=my_reset_index(temp_dff)
    fig.tight_layout()
    count=0  # subplot row currently being drawn into for this ingredient
    axes[count][ing_idx].set_title(ing,fontsize=15)
    for idx,col in enumerate(temp_dff.columns[1:]):  # skip the 'month' column
        axes[count][ing_idx].plot('month',col,data=temp_dff)
        show='n'  # 'n' while the current subplot still lacks legend/labels
        if (idx+1)%2==0:
            # Two curves drawn: finish this axes and move down a row.
            axes[count][ing_idx].legend()
            show='y'
            axes[count][ing_idx].set_xlabel('Month')
            axes[count][ing_idx].set_ylabel('Total Yeild (tons)')
            count+=1
    if show=='n':
        # Odd number of curves: the last subplot still needs legend/labels.
        axes[count][ing_idx].legend()
        axes[count][ing_idx].set_xlabel('Month')
        axes[count][ing_idx].set_ylabel('Total Yeild (tons)')
deidentified_locations
final_merged_df_viz.head(2)
final_merged_df_train.head(2)
import matplotlib.pyplot as plt
months=np.unique(final_merged_df['month'])
# Per location: monthly ing_w totals from the training year (left axis)
# against the forecast (right axis), on twin y-axes.
for loc in deidentified_locations:
    fig = plt.figure(figsize=(10,5))
    host = fig.add_subplot(111)
    par1 = host.twinx()  # second y-axis so train/test scales don't clash
    host.set_xlabel(str.upper("MONTH"),fontsize=12)
    host.set_ylabel(str.upper("Train Total Yield (tons)"),fontsize=12)
    par1.set_ylabel(str.upper("Test Total Yield (tons)"),fontsize=12)
    # Train frame is already encoded: look the location up through
    # deid_list_oe_dict, and select ing_w as the all-zeros dummy row.
    yield1=pd.DataFrame(final_merged_df_train[(final_merged_df_train['deidentified_location']==deid_list_oe_dict[loc])&\
    (final_merged_df_train['ingredient_type_ing_x']==0)&(final_merged_df_train['ingredient_type_ing_y']==0)&\
    (final_merged_df_train['ingredient_type_ing_z']==0)].groupby(['month'])['yield'].sum())
    # The viz frame keeps raw strings, so it can be filtered directly.
    yield2=pd.DataFrame(final_merged_df_viz[(final_merged_df_viz['deidentified_location']==loc)&(final_merged_df_viz['ingredient_type']=='ing_w')].\
    groupby(['month'])['yield'].sum())
    p1, = host.plot(months,yield1,color='r',label="Train Total Yield (tons)",linewidth=3)
    p2, = par1.plot(months,yield2,color='b',label="Test Total Yield (tons)",linewidth=2)
    lns = [p1, p2]
    host.legend(handles=lns, loc='center left',bbox_to_anchor=(1.3, 0.5))
    # Colour-match each axis label to its line.
    host.yaxis.label.set_color(p1.get_color())
    par1.yaxis.label.set_color(p2.get_color())
    plt.title('Month wise Train Vs Test Total Yield for "ing_w" in '+loc,fontsize=15)
    plt.show()
# Assemble the submission file: one predicted yield per test-record id.
output = pd.DataFrame({'id': final_merged_df['id'], 'yield': y_test_pred})
output.dtypes
print('\nuint16 max=', np.iinfo(np.uint16).max, '\nuint16 min=', np.iinfo(np.uint16).min)
# Shrink dtypes before writing out.
for out_col, out_dtype in (('yield', np.float32), ('id', np.uint32)):
    output[out_col] = output[out_col].astype(out_dtype)
np.max(output['yield'])
output.dtypes
output.head()
output = output.sort_values(['id'])
output.shape
output.to_csv('submission.csv', index=False)